import altair as alt
import pandas as pd
import ppscore as pps
from pycaret.regression import *
from ydata_profiling import ProfileReport
# customize Altair
def y_axis():
    return {
        "config": {
            "axisX": {"grid": False},
            "axisY": {
                "domain": False,
                "gridDash": [2, 4],
                "tickSize": 0,
                "titleAlign": "right",
                "titleAngle": 0,
                "titleX": -5,
                "titleY": -10,
            },
            "view": {
                "stroke": "transparent",
                # To keep the same height and width as the default theme:
                "continuousHeight": 300,
                "continuousWidth": 400,
            },
        }
    }
alt.themes.register("y_axis", y_axis)
alt.themes.enable("y_axis")
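Once the theme is enabled, every subsequent chart picks it up; a minimal check on hypothetical toy data:
# Quick visual check that the custom theme is active (toy data).
toy = pd.DataFrame({"category": list("ABC"), "value": [1, 3, 2]})
alt.Chart(toy).mark_bar().encode(x="category:N", y="value:Q")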
def get_descriptions():
    """Parse the column descriptions of the Ames Housing dataset."""
    descriptions = {}
    with open("data_description.txt") as reader:
        for line in reader:
            if ": " in line and "2nd level" not in line:
                name, description = line.split(": ", 1)
                descriptions[name.strip()] = description.strip()
    return pd.Series(descriptions).rename("descriptions")
descriptions = get_descriptions()

Predicting house prices in Ames, Iowa

Objectives
- Walk through an example end-to-end supervised learning workflow on the Ames Housing dataset
- Focus on conceptual understanding of machine learning
- Demonstrate the use of the Predictive Power Score (PPS)
- Demonstrate the capabilities of low-code tools
Attribution
Dataset
- Ames Housing dataset (original paper)
- Kaggle competition "House Prices: Advanced Regression Techniques" (link)
Python libraries
- Altair (docs)
- ydata-profiling (docs)
- Predictive Power Score (PPS, GitHub, blog)
- PyCaret: an open-source, low-code machine learning library in Python that automates machine learning workflows (link)
Read and explore the data
%%time
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
profile = ProfileReport(train, minimal=True, title="Ames Housing Profiling Report")
profile.to_file("ames-housing-profiling-report-minimal.html")

CPU times: user 45 s, sys: 1.78 s, total: 46.7 s
Wall time: 16.4 s
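As an aside, dropping minimal=True produces a full report with correlations and interaction plots, at the cost of a noticeably longer runtime; a sketch:
# Optional: the full (non-minimal) report is slower but more detailed.
full_profile = ProfileReport(train, title="Ames Housing Profiling Report")
full_profile.to_file("ames-housing-profiling-report.html")
The minimal report generated above can be embedded directly in the notebook: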
profile.to_notebook_iframe()

Investigate features with largest predictive power
We use the Predictive Power Score to evaluate which features have the highest predictive power with respect to SalePrice.
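For intuition, pps.score computes the score for a single feature-target pair; a minimal sketch on hypothetical toy data, where the target is a deterministic but non-linear function of the feature:
# Hypothetical toy example: y = x**2, so correlation is ~0 but the PPS is high.
toy = pd.DataFrame({"x": range(-50, 50)})
toy["y"] = toy["x"] ** 2
pps.score(toy, "x", "y")["ppscore"]  # close to 1.0
Applied to the training data, pps.predictors ranks all features against the target at once: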
predictors = (
    pps.predictors(train, "SalePrice")
    .round(3)
    .iloc[:, :-1]
    .merge(descriptions, how="left", left_on="x", right_index=True)
)
base = (
    alt.Chart(predictors)
    .encode(
        x=alt.X("x:N").sort("-y"),
        y="ppscore",
        tooltip=["x", "ppscore", "descriptions"],
    )
    .transform_filter("datum.ppscore > 0")
)
base.mark_bar() + base.mark_text(align="center", dy=-5)

Investigate collinearity
pps_matrix = (
    pps.matrix(
        train.loc[:, predictors.query("ppscore > 0")["x"].tolist()],
    )
    .loc[:, ["x", "y", "ppscore"]]
    .round(3)
)
(
    alt.Chart(pps_matrix)
    .mark_rect()
    .encode(
        x="x:O",
        y="y:O",
        color="ppscore:Q",
        tooltip=["x", "y", "ppscore"],
    )
)

Build models
We select the 30 features with the highest Predictive Power Score:
selected_predictors = (
    predictors.sort_values("ppscore", ascending=False).head(30)["x"].to_list()
)
reg = setup(
    data=train.loc[:, selected_predictors + ["SalePrice"]],
    target="SalePrice",
    numeric_imputation="mean",
    categorical_features=list(
        train.loc[:, selected_predictors].select_dtypes("object").columns
    ),
    feature_selection=False,
    pca=False,
    remove_multicollinearity=True,
    remove_outliers=False,
    normalize=True,
)

|   | Description | Value |
|---|---|---|
| 0 | Session id | 8378 |
| 1 | Target | SalePrice |
| 2 | Target type | Regression |
| 3 | Original data shape | (1460, 31) |
| 4 | Transformed data shape | (1460, 116) |
| 5 | Transformed train set shape | (1021, 116) |
| 6 | Transformed test set shape | (439, 116) |
| 7 | Ordinal features | 1 |
| 8 | Numeric features | 16 |
| 9 | Categorical features | 14 |
| 10 | Rows with missing values | 94.7% |
| 11 | Preprocess | True |
| 12 | Imputation type | simple |
| 13 | Numeric imputation | mean |
| 14 | Categorical imputation | mode |
| 15 | Maximum one-hot encoding | 25 |
| 16 | Encoding method | None |
| 17 | Remove multicollinearity | True |
| 18 | Multicollinearity threshold | 0.900000 |
| 19 | Normalize | True |
| 20 | Normalize method | zscore |
| 21 | Fold Generator | KFold |
| 22 | Fold Number | 10 |
| 23 | CPU Jobs | -1 |
| 24 | Use GPU | False |
| 25 | Log Experiment | False |
| 26 | Experiment Name | reg-default-name |
| 27 | USI | 81f6 |
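The transformed design matrix can be inspected after setup; a sketch, assuming PyCaret 3's get_config keys:
# Peek at the preprocessed training data (one-hot encoded, normalized; assumed key name).
X_train_transformed = get_config("X_train_transformed")
X_train_transformed.head()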
%%time
# Compare all available regressors, excluding lar, lr, and ransac
selected_models = [model for model in models().index if model not in ["lar", "lr", "ransac"]]
best_model = compare_models(sort="RMSLE", include=selected_models)

|   | Model | MAE | MSE | RMSE | R2 | RMSLE | MAPE | TT (Sec) |
|---|---|---|---|---|---|---|---|---|
| lightgbm | Light Gradient Boosting Machine | 18267.8967 | 969345616.0929 | 30245.0381 | 0.8400 | 0.1474 | 0.1051 | 0.3780 |
| gbr | Gradient Boosting Regressor | 18349.4461 | 1064907228.1139 | 31464.4286 | 0.8221 | 0.1497 | 0.1059 | 0.0810 |
| rf | Random Forest Regressor | 18834.6022 | 1052157810.7295 | 31669.8884 | 0.8263 | 0.1530 | 0.1091 | 0.1370 |
| par | Passive Aggressive Regressor | 18695.3332 | 1145943934.1128 | 32527.7429 | 0.8093 | 0.1535 | 0.1061 | 0.0560 |
| en | Elastic Net | 19941.1185 | 1212771199.2709 | 33679.9238 | 0.8018 | 0.1536 | 0.1131 | 0.0370 |
| et | Extra Trees Regressor | 19749.8604 | 1158574471.9795 | 33510.6172 | 0.8073 | 0.1591 | 0.1138 | 0.1370 |
| huber | Huber Regressor | 18580.7407 | 1172797296.1965 | 32571.8573 | 0.8024 | 0.1602 | 0.1069 | 0.0420 |
| br | Bayesian Ridge | 20557.3468 | 1251454965.3245 | 34036.3809 | 0.7934 | 0.1715 | 0.1191 | 0.0380 |
| ard | Automatic Relevance Determination | 20446.5401 | 1229331466.4696 | 33711.3986 | 0.7969 | 0.1747 | 0.1193 | 0.2740 |
| omp | Orthogonal Matching Pursuit | 21882.7966 | 1294135379.9217 | 34955.8947 | 0.7847 | 0.1849 | 0.1296 | 0.0340 |
| ada | AdaBoost Regressor | 24866.3282 | 1379609584.9159 | 36498.7175 | 0.7707 | 0.2036 | 0.1621 | 0.0580 |
| knn | K Neighbors Regressor | 26571.2016 | 1730405638.7521 | 40931.3774 | 0.7200 | 0.2050 | 0.1518 | 0.0360 |
| dt | Decision Tree Regressor | 27747.5148 | 2157234242.4490 | 45330.0191 | 0.6512 | 0.2169 | 0.1564 | 0.0350 |
| llar | Lasso Least Angle Regression | 21458.2025 | 1320695830.3446 | 35006.4301 | 0.7809 | 0.2187 | 0.1268 | 0.0380 |
| lasso | Lasso Regression | 21455.6793 | 1320742178.3808 | 35006.1951 | 0.7809 | 0.2189 | 0.1268 | 0.2100 |
| ridge | Ridge Regression | 21439.2241 | 1318937040.3720 | 34981.0548 | 0.7812 | 0.2196 | 0.1266 | 0.0360 |
| svm | Support Vector Regression | 55543.3805 | 6417749387.4994 | 79739.3850 | -0.0524 | 0.3979 | 0.3195 | 0.0450 |
| dummy | Dummy Regressor | 57352.4774 | 6133919031.8184 | 78021.7431 | -0.0086 | 0.4061 | 0.3635 | 0.0340 |
| tr | TheilSen Regressor | 29178.3219 | 2564758742.0908 | 49572.3895 | 0.5667 | 0.4258 | 0.1978 | 4.0290 |
| kr | Kernel Ridge | 182040.0692 | 34133507087.4154 | 184731.2672 | -4.7500 | 1.7994 | 1.1623 | 0.0380 |
| mlp | MLP Regressor | 166456.5847 | 32851392125.7179 | 181040.0031 | -4.4796 | 2.7703 | 0.9182 | 0.2890 |
CPU times: user 4.09 s, sys: 446 ms, total: 4.53 s
Wall time: 1min 3s
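A natural follow-up is hyperparameter tuning of the winner; a sketch, assuming PyCaret's tune_model defaults (random search over a predefined grid):
# Tune the best model, optimizing the same metric used for ranking.
tuned_model = tune_model(best_model, optimize="RMSLE")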
Evaluation
- With a standard, AutoML-like workflow, we achieve an RMSLE of 0.13 to 0.14 (over different runs), which already places in the top 25% of the 4,200 submissions on the Kaggle leaderboard
- We can now check hold-out performance and make predictions on the test set (see the sketch below)
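As a sanity check before touching the Kaggle test set, PyCaret can score the model on the hold-out split created by setup (calling predict_model without data uses the internal hold-out set):
# Score the best model on the internal hold-out split.
holdout_results = predict_model(best_model)
The Kaggle test set is then scored the same way, with data=test: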
predictions = (
    predict_model(best_model, data=test)
    .rename(columns={"prediction_label": "SalePrice"})
    .loc[:, ["Id", "SalePrice"]]
)
predictions.head()

|   | Id | SalePrice |
|---|---|---|
| 0 | 1461 | 126951.931078 |
| 1 | 1462 | 142402.002648 |
| 2 | 1463 | 185086.014955 |
| 3 | 1464 | 191718.590497 |
| 4 | 1465 | 186412.972060 |
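To enter the predictions into the Kaggle competition, they need to be written as a two-column CSV; a sketch (the file name is an assumption):
# Write the submission file (hypothetical file name).
predictions.to_csv("submission.csv", index=False)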
Pipeline
plot_model(best_model, 'pipeline')   # diagram of the preprocessing + model pipeline
plot_model(best_model, 'feature')    # feature importance
plot_model(best_model, 'residuals')  # residuals plot
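As a final step, the model can be refit on the full training data and persisted for later use; a sketch using PyCaret's finalize_model and save_model (the file name is an assumption):
# Refit the pipeline plus model on the entire training set and save it to disk.
final_model = finalize_model(best_model)
save_model(final_model, "ames-best-model")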